#######------PGMs and Conflict Data Construction------#######

#This file is only for creating the data frame 
    #and visualizations of the data (maps and whatnot)

# Building the DF ---------------------------------------------------------
rm(list = ls()) 
library(dplyr)
library(tidyr)
library(countrycode)
#Set working directory
getwd()
#dir.create("/Users/benjaminleo/Downloads/PGMs_and_CW_onset_Empirics")
setwd("/Users/benjaminleo/Downloads/PGMs_and_CW_onset_Empirics")

#Loading the PGM dataset 
#install.packages("dataverse")
library(dataverse)
#install.packages("bit")
library(bit)
Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
pgms <- get_dataframe_by_name(
  filename = "pgmdv2_countryyear.tab",
  dataset = "doi:10.7910/DVN/YK8L4I",
  server = "dataverse.harvard.edu"
)
View(pgms)

#Using peacescienceR to create a dataframe with most of the control variables  
#install.packages("peacesciencer")
library(peacesciencer)
cow_df <- create_stateyears(system = "cow") 
View(cow_df)

#adding in data 
cow_df <- add_democracy(cow_df)
cow_df <- add_rugged_terrain(cow_df)
cow_df <- add_cow_majors(cow_df)
cow_df <- add_sdp_gdp(cow_df)
cow_df <- add_creg_fractionalization(cow_df)
cow_df <- add_nmc(cow_df)
cow_df <- add_gwcode_to_cow(cow_df)
cow_df <- add_ucdp_acd(cow_df, c("intrastate","||"))


#Time since conflict variable 
library(stevemisc)
cow_df <- ps_btscs(cow_df, ucdpongoing, year, ccode)
summary(cow_df$spell)

cow_df <- cow_df |>
  rename(ts_war = spell)

#Reducing the temporal zone to the apropriate time period 
cow_df <- cow_df |>
  filter(year >= 1981 & year <= 2014)
cow_df <- unique(cow_df)

#Taking a look at this dataframe 
summary(cow_df)


#Loading in coup data  
coups <- read.delim('http://www.uky.edu/~clthyn2/coup_data/powell_thyne_coups_final.txt')
View(coups)
# Assuming coups is your dataframe
coups <- coups|>
  mutate(coup_attempt = 1)
coups <- select(coups, -c("ccode_gw", 
                          "ccode_polity", 
                          "month", 
                          "day", 
                          "version"))
#correct dataframe 
coups <- coups |>
  filter(year >= 1981 & year <= 2014)

#Need to get this to country year 
coups <- distinct(coups, ccode, year, .keep_all = TRUE)


#EPR data - better data on ethnic exlusion than CREG
#Cant use Selway data, ends in 1999 - kills 15 years of obs 
epr <- read.delim("https://icr.ethz.ch/data/epr/core/EPR-2021.txt")
View(epr)

#I need to create an indicator for the level of ethnic inequality by state
#this is going to be a bit of a pain
epr <- epr |>
  rename(country = statename)

#need to first make group year the UOA
expanded_epr <- epr |>
  group_by(group, gwid) |>
  summarise(from = min(from), to = max(to)) |>
  uncount(to - from + 1, .id = "index") |>
  mutate(year = from + index - 1) |>
  select(-from, -to, -index) |>
  left_join(epr, by = c("group", "gwid"))

View(expanded_epr)


#Now we need to sort by country year, and add indicators for ethnic exclusion
# Create the new variable ethnic_exclus based on status
expanded_epr$ethnic_exclus <- ifelse(expanded_epr$status %in% c("DISCRIMINATED", "POWERLESS"), 1, 0)
hist(expanded_epr$ethnic_exclus)


# Group by country and year and collapse observations
collapsed_epr <- expanded_epr |>
  group_by(gwid, year) |>
  summarise(
    ethnic_exclus = as.numeric(any(ethnic_exclus == 1)),
    total_size = sum(size),
    num_groups = sum(ethnic_exclus)
  )


#I need an interaction term between presence and group size 
collapsed_epr$size_exclu_int <- collapsed_epr$num_groups * collapsed_epr$total_size
hist(collapsed_epr$size_exclu_int)


#checking everything out 
hist(collapsed_epr$ethnic_exclus)
hist(collapsed_epr$num_groups)
rm(expanded_epr, epr)
View(collapsed_epr)



#set the right timeframe 
collapsed_epr <- collapsed_epr |>
  filter(year < 2015,
         year > 1980)

#good to go 

#World Bank database to get some of the control variables that arent in psr
#install.packages("wbstats")
library("wbstats")
#install.packages("janitor")
library("janitor")

#getting the data 
#oil <- wb_search(pattern = "Oil rents" , fields = "indicator")
#print(oil)
#the code for Oil rents (% of GDP) is NY.GDP.PETR.RT.ZS
#internet_oil <- wb_data(indicator - "NY.GDP.PETR.RT.ZS", "countries_only")
#internet <- wb_search(pattern = "population" , fields = "indicator")
# code for population total is SP.POP.TOTL
#using wbstats to pull the data in 
#store the list of indicators in an object 
indicators <- c("oilrents" = "NY.GDP.PETR.RT.ZS", 
                "population" = "SP.POP.TOTL", 
                "Total natural resources rents " = "NY.GDP.TOTL.RT.ZS", 
                "population density" = "EN.POP.DNST", 
                "primary school enrollment" = "SE.PRM.NENR", 
                "gdpgrowth" = "NY.GDP.MKTP.KD.ZG")
#download the data 
wb_data <- wb_data(indicators ,
                   mrv = 45) %>%
  select(!iso2c) %>%
  rename(year = date)
rm(indicators)

#Getting the data ready to merge 
wb_data <- wb_data %>%
  filter(year >= 1981 & year <= 2014)
View(wb_data)
#A little bit of work needs to be done to get the WBdata ready to merge 
#Switching iso3c codes to cowcodes 
library(countrycode)
ccode_dta <- codelist_panel
View(ccode_dta)
ccode_dta <- ccode_dta %>% 
  filter(year >= 1981 & year <= 2014)
ccode_dta <- ccode_dta[, c( "year", 
                            "continent", 
                            "cown", 
                            "region",
                            "country.name.en", 
                            "iso3c")]

#adding ccodes to the WB data 
wb_full_data <- left_join(wb_data, ccode_dta, by = c("year" = "year", 
                                                     "iso3c" = "iso3c"))
View(wb_full_data)
wb_full_data <- wb_full_data %>%
  rename(ccode = cown)
rm(wb_data)
rm(ccode_dta)

#Getting rid of WB data that isnt for countries 
library(tidyr)
wb_full_data <- wb_full_data[!is.na(wb_full_data$ccode), ]
#removing duplicates 
wb_full_data <- unique(wb_full_data)
#there were no duplicates 
pgms <- unique(pgms)



# Merging Data ------------------------------------------------------------

#Putting all of this into one DF 
merging <- left_join(cow_df, coups, by = c("ccode" = "ccode", 
                                           "year" = "year"))
merging <- unique(merging)
summary(merging)
merging2 <- left_join(merging, pgms, by = c("gwcode" = "gwno", 
                                            "year" = "year"))
summary(merging2)
unique(merging2)


#Merge with epr 
merging2 <- left_join(merging2, 
                       collapsed_epr,
                       by = c("gwcode" = "gwid", 
                              "year" = "year"))
rm(collapsed_epr)

#Final merge 
full_data <- left_join(merging2,
                       wb_full_data, 
                       by = c("ccode" = "ccode", 
                               "year" = "year"))
full_data <- unique(full_data)

#removing everything except the final dataframe
rm(coups, cow_df, merging, merging2, pgms, wb_full_data)


#Now I need to do a little work on this DF, changing names, fixing variables, etc. 

# Cleaning Data -----------------------------------------------------------

#just making sure the temporal bounds are set correctedly 
full_data <- full_data %>%
  filter(year >= 1981 & year <= 2014)

#Some name changes 
full_data <- full_data %>%
  rename(gdppc = wbgdppc2011est) %>%
  rename(state = statenme) %>%
  rename(polity = polity2) %>%
  rename(rugged = rugged) %>%
  rename(maj_power = cowmaj) %>%
  rename(gpd = wbgdp2011est) %>%
  rename(pop = wbpopest) %>%
  rename(mil_expend = milex) %>%
  rename(cinc = cinc) %>%
  rename(cw_onset = ucdponset) %>%
  rename(peace_years = ts_war) %>%
  rename(coup = coup) %>%
  rename(pgm_count = presence_count) %>%
  rename(govt_pgm_count = presence_gov_formed_count) %>%
  rename(notgovt_pgm_count = presence_notgov_formed_count) %>%
  rename(pgm_presence = presence) %>%
  rename(govt_pgm_presence = presence_govformed) %>%
  rename(not_govt_pgm_presence = presence_notgovformed) %>%
  rename(ongoing_conflict = ucdpongoing) %>% 
  rename(natl_rents = `Total natural resources rents `) %>%
  rename(school_rates = `primary school enrollment`) %>%
  rename(population_density = `population density`)

#Now I need to fix up the coup variable#first thing I need to do is change all the NAs to 0
full_data <- full_data %>% 
  mutate(coup = ifelse(is.na(coup), 0, coup))

#time since variable coup
#library(stevemisc)
#fulldata <- ps_btscs(full_data, coup, year, ccode)
#summary(fulldata$spell)
#fulldata <- fulldata %>%
 # rename(ts_coup = spell)


#coldwar dummy
full_data$post_cw <- ifelse(full_data$year > 1991, 1, 0)

#I need to create logs and lags of certain variables 

# Define a small constant value - need to create logs for some variables with 0's
epsilon <- 1e-6  # You can adjust this value as needed
# Create log_rents with handling for zero values
full_data$log_rents <- log(full_data$natl_rents + epsilon)
full_data$log_milex <- log(full_data$mil_expend + epsilon)

#logs
full_data <- full_data |>
  mutate(log_gdp = log(gpd)) |>
  mutate(log_pop = log(population)) |>
  mutate(log_vdem = log(v2x_polyarchy)) |>
  mutate(polity_sq = polity^2) |>
  mutate(peace_years_sq = peace_years^2) |>
  mutate(peace_years_cb = peace_years^3) |>
  mutate(Vdem_sq = v2x_polyarchy^2) |>
  mutate(log_rugged = log(rugged)) 




#lags
full_data <- full_data %>%
  group_by(ccode) %>%
  mutate(lag_presence = lag(pgm_presence, 1)) %>%
  mutate(lag_count = lag(pgm_count, 1)) %>%
  mutate(lag_govt_pgm = lag(govt_pgm_presence, 1)) %>%
  mutate(lag_not_govt_pgm = lag(not_govt_pgm_presence, 1)) %>%  
  mutate(lag_govt_pgm_count = lag(govt_pgm_count, 1)) %>%
  mutate(lag_not_govt_pgm_count = lag(notgovt_pgm_count, 1)) 

#making the controls make sense 
full_data <- full_data |>
  mutate(mil_ex_2 = mil_expend/10000000)




#making sure there are no duplicates
duplicate_rows <- duplicated(full_data[c("ccode", "year")])
# Remove duplicate observations and overwrite the original dataframe
full_data <- full_data[!duplicate_rows, ]
#there were none, so all good 
  
#saving the dataframe 
save(full_data,
     file = "/Users/benjaminleo/Downloads/PGMs_and_CW_onset_Empirics795_cleaned_data.Rdata")

# Saving the Data ---------------------------------------------------------


#install.packages("foreign")
library(foreign)
write.dta(full_data, "pgm_cw_full.dta")

#Save as a csv
library(readr)
write_csv(full_data,
          "/Users/benjaminleo/Downloads/PGMs_and_CW_onset_Empirics795_cleaned_data.csv")


# Graphics  ---------------------------------------------------------------



#Making a PGM Map for the intro section - before I start my analyses I want to make a nice map showing the number of PGMs in each country 
#install.packages("dataverse")
library(dataverse)
#install.packages("bit")
library(bit)
Sys.setenv("DATAVERSE_SERVER" = "dataverse.harvard.edu")
pgm_group_df <- get_dataframe_by_name(
  filename = "pgmdv2_group.tab",
  dataset = "doi:10.7910/DVN/YK8L4I",
  server = "dataverse.harvard.edu"
)
View(pgm_group_df)

# Assuming pgm_group_df is your dataframe
pgm_group_df <- pgm_group_df %>%
  group_by(gwno, country) %>%
  summarise(total_groups = n())

#making a blank world map 
# Load necessary libraries
library(maps)
library(countrycode)
# Get world map data
world_map <- map_data("world")


#Adding gwno to world map data 
world_map$gwno <- countrycode(world_map$region, origin = "country.name", destination = "gwn")
#merging
merged_df <- left_join(world_map, pgm_group_df, by = "gwno")

#plot
ggplot() +
  geom_polygon(data = merged_df, aes(x = long, y = lat, 
                                     group = group, 
                                     fill = total_groups),
                                    color = "black") +
  scale_fill_gradient(name = "Total Groups", 
                      low = "white", 
                      high = "black", 
                      na.value = "white") +
  coord_fixed() +
  theme_void() + 
  labs(
    title = "Global Distribution of PGMs (1981-2014)"
  )

rm(merged_df, world_map, pgm_group_df)


#Making a map of cw onset
# Aggregate to find the sum of 'cw_onset' by 'ccode'
collapsed_data <- aggregate(cw_onset ~ gwcode,
                            data = full_data, sum)
# Rename the aggregated column for clarity
names(collapsed_data)[2] <- "sum_cw_onset"
# Print the resulting dataframe
View(collapsed_data)
#making a blank world map 
# Load necessary libraries
library(maps)
library(countrycode)
# Get world map data
world_map <- map_data("world")

#Adding gwno to world map data 
world_map$gwno <- countrycode(world_map$region, origin = "country.name", destination = "gwn")
#merging
merged_df <- left_join(world_map, collapsed_data, by = c("gwno" = "gwcode"))

#plot
ggplot() +
  geom_polygon(data = merged_df, aes(x = long, 
                                     y = lat, 
                                     group = group, 
                                     fill = sum_cw_onset), 
                                    color = "black") +
  scale_fill_gradient(name = "Number of Civil Conflicts", 
                      low = "white", 
                      high = "black", 
                      na.value = "white") +
  coord_fixed() +
  theme_void() + 
  labs(
    title = "Global Distribution of Civil Conflicts (1981-2014)"
  )


#conflict over time plot 

#count df
collapsed_conflict <- full_data %>%
  group_by(year) %>%
  summarise(cc_onset_count = sum(cw_onset, 
                                     na.rm = TRUE), 
            .groups = 'drop')

#Creating the plot 
ggplot(collapsed_conflict, aes(x = year, y = cc_onset_count)) +
  geom_line() +
  labs(x = "Year", y = "Number of New Civil Conflicts") +
  ggtitle("Civil Conflict Onset by Year (1981-2014)") +
  scale_y_continuous(breaks = seq(0, max(collapsed_conflict$cc_onset_count), by = 1)) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

#PGMs over time plot 

#count df
collapsed_PGMs <- full_data %>%
  group_by(year) %>%
  summarise(pgm_count = sum(pgm_count, 
                                 na.rm = TRUE), 
            .groups = 'drop')

#Creating the plot 
ggplot(collapsed_PGMs, aes(x = year, y = pgm_count)) +
  geom_line() +
  labs(x = "", y = "Number of Active PGMs") +
  ggtitle("PGM Presence by Year (1981-2014)") +
  scale_y_continuous(breaks = seq(0, max(collapsed_conflict$cc_onset_count), by = 1)) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

#States with a PGM active by year  

#count df
collapsed_PGMs <- full_data %>%
  group_by(year) %>%
  summarise(pgm_presence = sum(pgm_presence, 
                            na.rm = TRUE), 
            .groups = 'drop')

#Creating the plot 
ggplot(collapsed_PGMs, aes(x = year, y = pgm_presence)) +
  geom_line() +
  labs(x = "", y = "Number of States with a PGM") +
  ggtitle("Number of States With a PGM (1981-2014)") +
  scale_y_continuous(breaks = seq(0, max(collapsed_PGMs$year), by = 5)) +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())
